#!/usr/bin/env python
# -*- coding:utf-8 -*-

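'''Study notes on working with lxml.

Summary:
    lxml.html.fromstring    parses text into an lxml.etree._Element
    element.xpath()         finds nodes with an XPath expression; nodes behave
                            much like dictionaries
    element.attrib          dictionary of the node's attributes
    element.text_content()  returns all text content underneath the node

History:
    2017-02-27 姚彧 created
'''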

from lxml import etree
from lxml import html
from helper.file_helper import * 

def html_body_text_content(root_el):
    '''Return the text of the last child element of the document's <body>.

    The body is located with the absolute XPath ``//html/body``; the first
    match is used, and the text is taken from its last child (not from the
    whole body).  Non-breaking spaces (U+00A0) in the result are replaced
    with ordinary spaces.

    NOTE(review): only the *last* child of <body> is read -- presumably the
    input files keep all of their content in that element; confirm.

    Args:
        root_el: an ``lxml.etree._Element`` belonging to the parsed document.

    Returns:
        The text content of the selected element as a string.

    Raises:
        TypeError: if ``root_el`` is not an ``lxml.etree._Element``.
        IndexError: if no ``/html/body`` element is found, or if the body
            has no child elements.
    '''
    # Explicit check rather than ``assert``: asserts are stripped under -O.
    if not isinstance(root_el, etree._Element):
        raise TypeError(
            'root_el must be an lxml.etree._Element, got %s'
            % type(root_el).__name__)

    bodies = root_el.xpath('//html/body')
    if not bodies:
        raise IndexError('no /html/body element found in the document')

    body_el = bodies[0]
    # Use len() here: truth-testing an lxml element is discouraged by lxml.
    if len(body_el) == 0:
        raise IndexError('<body> element has no child elements')

    last_child = body_el[-1]
    return last_child.text_content().replace('\xa0', ' ')

if __name__ == '__main__':
    def main():
        '''Parse a sample HTML file and print its extracted text as a list of lines.

        The file is read with ``yy_file_read_content`` (presumably provided by
        the ``helper.file_helper`` star import -- confirm), parsed with
        ``lxml.html.fromstring`` and passed to ``html_body_text_content``.
        '''
        html_path = r'Z:\HLM\html\挡土墙0113.xls.html'
        raw_html = yy_file_read_content(html_path)

        doc_root = html.fromstring(raw_html)
        assert isinstance(doc_root, etree._Element)

        body_text = html_body_text_content(doc_root)
        lines = body_text.split('\n')
        print(lines)

        # Earlier experiment (disabled): parse with etree.HTML(content) and
        # print the tag and text of the children of the first //div/div match.

    main()
